#!/usr/bin/env python
#
# SUMMARY
#
# Extracts table content of the resolution summary tables from the UN website and converts it into a data file (delimiter: semicolon)
# Regular sessions only, if an entry refers to several meetings and dates, only one is extracted
# Several typos, missing pieces of information, etc. have been corrected or added
# Source: http://www.un.org/documents/resga.htm

# The original websites are first copied to a local copy, this only has to be done once (by uncommenting the relevant section below)
# To run the code the locations of the directory have to be changed manually (see below)
#
# COMMENTS
# If there is no extra entry for decision-mode (last entry in column is a string) but a number of votes, code recorded as "Vote summary"
#
# Several pieces of information are in the fourth (session 1: third) column (meeting record ID, date, vote summary).
# These are extracted using regular expressions or by separating them first by splitting the cell on white spaces.
# The table structure could in principle also be used to split the fourth column, but several HTML parsers did not produce consistent results.
# The formatting for the 69th session (and part of 68 and some earlier entries) has changed, there is no white space between the meeting record ID and the date.
# The relevant meeting record IDs were identified manually and a white space inserted to allow the rest of the code to work in the same way
# for all sessions.
#
import urllib
from bs4 import BeautifulSoup
import re

# MANUAL CHANGES TO UPDATE/ADAPT CODE
#
# Range of sessions to loop over (both ends inclusive); adapt manually
first_session = 1
final_session = 69

# Directory holding the local copies of the summary pages ("<session>.htm"); adapt manually
data_file_directory = "M:/Userdata/Current/Projects/UN Voting Behaviour/Webscraping/Resolutions_summary/"

# Name of the output file (an existing file is overwritten); adapt manually
#output_filename="C:/DATA/Office/Current/Projects/UN Voting Behaviour/Webscraping/all_resolutions.txt"
output_filename = "M:/Userdata/Current/Projects/UN Voting Behaviour/Webscraping/all_resolutions_10.txt"
#
#
## This part copies the original website to a local copy
# url_part1="http://www.un.org/depts/dhl/resguide/r"
# url_part2="_resolutions_table_eng.htm"
# url_part2_68="_resolutions_table_en.htm"
# for session_number in range (first_session, final_session+1) :
#     if session_number<=67:
#         url=url_part1+str(session_number)+url_part2
#     else:
#         url=url_part1+str(session_number)+url_part2_68
#     html = urllib.urlopen(url).read()
#     data_file_name=data_file_directory+str(session_number)+".htm"
#     print "Writing session number: "+str(session_number)
#     data_file=open(data_file_name, "w")
#     data_file.write(html)
#     data_file.close()
# 

# Start a fresh output file (any previous content is discarded) containing only
# the semicolon-separated column names; the data rows are appended further below.
with open(output_filename, "w") as output:
    output.write("res_id;location;record_id;date;year;recorded;votes;yes;no;abstain;draft;topic\n")

## SETTING UP MAIN PART

# Compiling regular expressions.
# Vote summary "yes-no-abstain", preceded by a whitespace or by an opening bracket
# (sometimes the vote record is put in brackets, e.g., "(102-12-2)").
# The leading character is part of the match and is stripped again by the caller.
# Note: inside a character class "|" is a literal, not an alternation, so it is not listed.
votes_regex=re.compile(r'([\s(][0-9]{1,3}-[0-9]{1,3}-[0-9]{1,3})')
# Date "day month year", preceded by a whitespace. The month sometimes comes abbreviated
# (e.g., "Sept.") and sometimes with the full name (e.g., "December", "September", "May");
# the whitespace between month and year is optional.
date_regex=re.compile(r'(\s[0-9]{1,2}\s[a-zA-Z]{3,9}[\.]?[\s]?[0-9]{4})')

# Meeting record IDs that run directly into the date in the fourth column
# (session 69, parts of session 68 and two earlier entries). Each ID is mapped to
# itself followed by two blanks, so the cell can later be split on double blanks.
_pv_numbers_69=(105, 103, 102, 101, 100, 99, 98, 97, 96, 93, 92, 91, 90, 89, 86,
                84, 81, 80, 79, 78, 76, 75, 73, 72, 71, 70, 69, 68, 67, 65, 64,
                63, 62, 61, 58, 57, 55, 51, 48, 38, 36, 30, 22)
_pv_numbers_68=(109, 108, 107, 105, 104, 100, 99, 98, 97, 91, 90, 87, 86, 85, 84)
_glued_record_ids=["A/53/PV.107", "A/57/PV.86"]
_glued_record_ids.extend("A/69/PV."+str(number) for number in _pv_numbers_69)
_glued_record_ids.extend("A/68/PV."+str(number) for number in _pv_numbers_68)
addwhitespace_dict=dict((record, record+"  ") for record in _glued_record_ids)

# Definitions for recoding of the date
# Abbreviated months found in the data that have no entry in abbr_dict (reported at the end)
abbr_months_list=[]
# Date whose month is abbreviated, i.e. ends with a dot; group 1 is the abbreviation including the dot
abbr_regex=re.compile(r'[0-9]{1,2}\s([a-zA-Z]{3,9}\.)[\s]?[0-9]{4}')
# Abbreviated month -> full month name ("May" has no abbreviated form).
# "Sep." is not a substring of "Sept.", so the two entries cannot interfere when replacing.
abbr_dict={"Jan.": "January",
           "Feb.": "February",
           "Mar.": "March",
           "Apr.": "April",
           "Jun.": "June",
           "Jul.": "July",
           "Aug.": "August",
           "Sep.": "September",
           "Sept.": "September",
           "Oct.": "October",
           "Nov.": "November",
           "Dec.": "December"}

# Collectors for the problem report printed after all sessions have been processed.
# Rows with fewer than five cells (stores the resolution ID seen before the short row)
short_row_list = []
# Rows whose topic says that the resolution number was not assigned
not_assigned_list = []
# Rows lacking a date, a year or a resolution ID
missing_date_list = []
missing_year_list = []
missing_res_id_list = []
# Dates that do not have the form "day month year"
wrong_dateformat_list = []
# Resolution numbers that break the consecutive numbering within a session
missing_consec_number_list = []

## START OF THE MAIN PART
# count is the number of table rows processed so far (over all sessions)
count=0
for session_number in range (first_session, final_session+1) :
     data_file_name=data_file_directory+str(session_number)+".htm"
     # The parser reads the whole file when the soup is built, so the file can
     # (and should) be closed right afterwards instead of leaking one handle per session
     with open(data_file_name, 'r') as html:
          soup = BeautifulSoup(html, "html5lib")
     table = soup.find("table", {"class":"tablefont"})
     # One list of cell texts per table row; the first two rows of the table are skipped
     datasets = []
     for row in table.find_all("tr")[2:]:
          dataset = [td.get_text() for td in row.find_all("td")]
          datasets.append(dataset)
     
     # Session number one has one column less (added third column after session number 1)
     # column is the index of the cell holding meeting record ID, date and vote summary
     if session_number<2:
           column=2
     else:
           column=3
     
     for row in datasets:               
          # Skip empty rows, but record the previous resolution_id for the report

          if len(row)<5:
                # resolution_id is only set once a full row has been processed (count>0);
                # use a placeholder instead of failing if the very first row is short
                if count==0:
                      short_row_list.append("(before first resolution)")
                else:
                      short_row_list.append(resolution_id)
                continue
          
          ## DATA PREPARATION
          
          # Removing non-ascii unicode characters, newlines and redundant white spaces
          # First two cells (resolution ID, location): only line breaks (and a long run of blanks
          # in the second cell) are removed; these two cells are not converted to ASCII here
          row[0]=row[0].replace('\n', '').replace('\r', '')
          row[1]=row[1].replace('\n', '').replace('\r', '').replace('                            ', '')         
          # Meeting-record cell, the cell after it and the last cell: non-ASCII characters are dropped
          row[column]=row[column].encode("ascii", "ignore")
          row[column+1]=row[column+1].encode("ascii", "ignore")
          row[-1]=row[-1].encode("ascii", "ignore")  
          row[column]=row[column].replace('\n', '').replace('\r', '')
          row[column+1]=row[column+1].replace('\n', '').replace('\r', '')
          # In the last cell a newline becomes a blank, tabs are removed and runs of blanks are shortened
          row[-1]=row[-1].replace('\n', ' ').replace('\r', '').replace('\t', '').replace('   ', ' ').replace('  ', ' ')
          
          # Formatting of cell is lost and there are no whitespaces between meeting record ID and date in session 69, several entries in session 68
          # and some others throughout; the known meeting record IDs get two blanks appended
          for k, v in addwhitespace_dict.iteritems():
                row[column]=row[column].replace(k, v)          
          # A/69/PV.4 and A/69/PV.3 are ambiguous, because they are contained in A/69/PV.48, etc.
          # They are therefore only replaced in session 69 cells that mention "September 2014"
          if "September 2014" in row[column] and session_number==69: 
                row[column]=row[column].replace("A/69/PV.4", "A/69/PV.4  ").replace("A/69/PV.3", "A/69/PV.3  ")

                    
          ## CODING OF VARIABLES
          
          # Double white space used to split up second/third column, which contains several pieces of information (record ID, date, vote, recorded)
          # In some instances, there are no double white spaces, however, so they are added before known expressions (vote, date) using regular expressions
          # Using regular expressions directly to extract all information would be more complicated because the meeting record can have various forms
          # Regular expressions are used to directly extract information with known form (vote, date)
          
          # Defaults for this row
          votes=""
          yes="Missing"
          no="Missing"
          abstain="Missing"
          recorded=""
          record_id=""
          date=""
          year= ""
          # Make sure the decision mode is preceded by a double white space
          if " without vote" in row[column]:
               row[column]=row[column].replace(" without vote", "  without vote")
          if " without a vote" in row[column]:
               row[column]=row[column].replace(" without a vote", "  without vote")
          # Vote summary: insert a blank in front of it and keep the bare "yes-no-abstain" string
          match=re.search(votes_regex, row[column])
          if match:
               row[column]=re.sub(votes_regex, " "+str(match.group()), row[column])
               votes=str(match.group())
               votes=votes.strip(" ")
               votes=votes.strip("(")
          # Date: insert a blank in front of it and keep the date; the year is its last four characters
          match=re.search(date_regex, row[column])
          if match:
               row[column]=re.sub(date_regex, " "+str(match.group()), row[column])
               date=str(match.group())
               date=date.strip(" ")
               year=date[-4:]
          # Always split, even if the cell contains no double white space: the cell then becomes a
          # list with a single element. Leaving it as a string would make filter() return a string
          # (Python 2) and the indexing below would pick single characters instead of sub-elements.
          row[column]=row[column].split("  ")
          # Removing empty elements due to a) extraction or b) splitting up in the previous step
          row[column]=filter(None, row[column])

          if len(row[column])>1:
               # Extracting record_id (first sub-element)
               record_id=row[column][0]
               # Coding decision mode (last sub-element)
               recorded=row[column][-1]
               # If not split up properly, the last entry can contain both date and decision mode
               if "unanimously" in recorded:
                    recorded="unanimously"
               elif "withoutvote" in recorded or "without vote" in recorded:
                    recorded="without vote"
               elif "non-recorded" in recorded:
                    recorded="non-recorded"
               # If there is no extra entry for decision-mode (last entry in column is a string) but a number of votes, code recorded as "Vote summary"
               elif re.search(r"([0-9]{1,3}-[0-9]{1,3}-[0-9]{1,3})", recorded) is not None or votes:
                    recorded="Vote summary"
               # Sometimes last entry is the date (either date format), this gets coded as "Missing"
               if re.search(r"([0-9]{1,2}\s[a-zA-Z]{3,9}[\.]?[\s]?[0-9]{4})", recorded) is not None:
                    recorded="Missing"
               if re.search(r"([0-9]{1,2}/[0-9]{1,2}/[0-9]{4})", recorded) is not None:
                    recorded="Missing"

               # Adding explicit missing code for missing vote summary
               if not votes:
                    votes="Missing"
          else:
               # record_id is missing for A/RES/41/177A-D, which only has date and subsequently only one sub-element
               record_id="Missing"
          
          # Resolution ID is the first cell without any blanks
          resolution_id=row[0].replace(" ", "")
          # Keep track of numbers that were never assigned (checked on the topic text before it is cleaned)
          if "Number was not assigned" in row[-1]:
               not_assigned_list.append(resolution_id)

          ## DATA CLEANING

          # Location (second cell), draft (cell after the meeting-record cell) and topic (last cell):
          # collapse every run of whitespace in draft and topic into one blank and trim the ends
          whitespace_run=re.compile(r'\s+')
          location=row[1].strip()
          draft=whitespace_run.sub(' ', row[column+1]).strip()
          topic=whitespace_run.sub(' ', row[-1]).strip()
          # The vote summary carries no blanks at all, the decision mode none at the ends
          votes=votes.replace(' ', '')
          recorded=recorded.strip()
          
          ## MANUAL CORRECTIONS
          
          # Dealing with unique entries (typos, etc.) 
          # Invalid date in recorded
          if "0-Jan-1900" in recorded:
               recorded="Missing"
          if "09Dec. 1949" in recorded:
               recorded="Missing"
          if "Mar. 1988" in recorded:
               recorded="Missing"
          # Invalid entry in recorded: the vote summary is written with blanks and is therefore
          # not found by the regular expression, so both variables are set by hand
          # (votes has to be assigned here, yes/no/abstain are derived from it further below)
          if "GA/11390 126 - 3 - 0" in recorded:
               recorded="Vote summary"
               votes="126-3-0"
          ## Dates are invalid or missing
          # Year is missing/incomplete
          if resolution_id=="A/RES/68/274":
               date="5 June 2014"
          if resolution_id=="A/RES/35/102":
               date="5 December 1980"
          if resolution_id=="A/RES/60/264":
               date="28 June 2006"
          # No whitespace between day and month
          if resolution_id=="A/RES/54/137":
               date="17 December 1999"
          # Cannot be split with replacement above due to ambiguity of meeting record code
          if resolution_id=="A/RES/52/252":
               date="8 September 1998"
          if resolution_id=="A/RES/41/1":
               date="10 October 1986"
          # Different date format
          if resolution_id=="A/RES/34/2A-B":
               date="21 September 1979"
          if resolution_id=="A/RES/34/65A-D":
               date="29 November 1979"
          if resolution_id=="A/RES/34/9A-E":
               date="1 November 1979"
          if resolution_id=="A/RES/34/7A-D":
               date="25 October 1979"
          if resolution_id=="A/RES/32/4A-C":
               date="25 October 1977"
          if resolution_id=="A/RES/32/21A-B":
               date="28 November 1977"
          if resolution_id=="A/RES/31/6A-K":
               date="26 October 1976"
          if resolution_id=="A/RES/31/16A-B":
               date="23 November 1976"
          if resolution_id=="A/RES/3393(XXX)A-B":
               date="20 November 1975"
          if resolution_id=="A/RES/3211(XXIX)A-B":
               date="31 October 1974"
          if resolution_id=="A/RES/908(X)A-B":
               date="17 November 1955"
          # Invalid/missing date
          if resolution_id=="A/RES/33/60":
               date="14 December 1978"
          if resolution_id=="A/RES/34/90A-C":
               date="12 December 1979"
          if resolution_id=="A/RES/34/92A-E":
               date="12 December 1979"
          if resolution_id=="A/RES/2262(XXII)":
               date="3 November 1967"
          if resolution_id=="A/RES/1126(XI)":
               date="22 February 1957"
          if resolution_id=="A/RES/33/13D":
               date="8 December 1978"
          # Unknown problem
          if resolution_id=="A/RES/42/230":
               date="23 March 1988"
          if resolution_id=="A/RES/42/229A-B":
               date="2 March 1988"
          if resolution_id=="A/RES/42/229A":
               date="2 March 1988"
          if resolution_id=="A/RES/42/229B":
               date="2 March 1988"
          # Creating year for cases in which the date was missing
          # (last four characters of the possibly corrected date)
          year=date[-4:]
          
          # Invalid resolution ID (leading "A" is missing)
          if resolution_id=="/RES/55/27":
               resolution_id="A/RES/55/27"
          if resolution_id=="/RES/64/105":
               resolution_id="A/RES/64/105"
                
          ## Recoding variables

          # Common date format is "Day Month(spelled out) Year".
          # A date with an abbreviated month gets every known abbreviation replaced by the full name
          if abbr_regex.search(date):
               for short_month, full_month in abbr_dict.iteritems():
                    date = date.replace(short_month, full_month)
          # An abbreviation that is still there is unknown; remember it once for the final report
          leftover=abbr_regex.search(date)
          if leftover:
               unknown_month=str(leftover.group(1))
               if unknown_month not in abbr_months_list:
                    abbr_months_list.append(unknown_month)
          # Month and year written without a whitespace in between: rebuild the date from its three parts
          glued=re.search(r'([0-9]{1,2})\s([a-zA-Z]{3,9})([0-9]{4})', date)
          if glued:
               date=" ".join(str(part) for part in glued.groups())
          # No leading zero in front of the day
          date=date.lstrip("0")
          # Everything that does not look like "day month year" now goes into the report
          if not re.search(r'([0-9]{1,2})\s([a-zA-Z]{3,9})\s([0-9]{4})', date):
               wrong_dateformat_list.append(date)

          # Separate variables for yes, no and abstain, taken from the vote summary if there is one
          if votes!="Missing":
               tally=re.search(r"([0-9]{1,3})-([0-9]{1,3})-([0-9]{1,3})", votes)
               if tally:
                    yes, no, abstain = tally.groups()
          
          ## CHECKING FOR MISSING VALUES

          # Rows without date or year are reported by resolution ID,
          # rows without resolution ID are reported by date
          if not date:
               missing_date_list.append(resolution_id)
          if not year:
               missing_year_list.append(resolution_id)
          if not resolution_id:
               missing_res_id_list.append(date)

          # Checking consecutive numbers for missing values
          # The resolution ID carries the session number from the 31st session on,
          # so the running number sits at a different position
          if session_number<31:
               number_pattern="A/RES/([0-9]{1,4})"
          else:
               number_pattern="A/RES/[0-9]{2}/([0-9]{1,4})"
          match=re.search(number_pattern, resolution_id)
          # Without a match the number of the previously processed row is kept
          if match:
               resolution_number=str(match.group(1))
          resolution_number=int(resolution_number)
          # The very first row has nothing to be compared with
          if count==0:
               prev_number=resolution_number
               prev_session=session_number
          # Within a session the numbers run downwards, so the first entry of a new session restarts the comparison
          if session_number!=prev_session:
               prev_number=resolution_number
          prev_session=session_number
          # Accepted: same number as in the row before (e.g., sub-resolutions) or exactly one less
          if resolution_number not in (prev_number, prev_number-1):
               missing_consec_number_list.append("%d/%d" % (session_number, resolution_number))
          prev_number=resolution_number
          
          ## Printing data to file (appending)
          output=open(output_filename, "a")
          # A/RES/50/125 was not assigned, so it is an empty row (other than the notice of this number not having been assigned)
          if resolution_id!="A/RES/50/125": 
                print >> output, resolution_id+";"+location+";"+record_id+";"+date+";"+year+";"+recorded+";"+votes+";"+yes+";"+no+";"+abstain+";"+draft+";"+topic
          output.close()
          
          print str(count)+": "+str(resolution_id)
          count +=1
## CREATING REPORT ABOUT POTENTIAL PROBLEMS
print "Rows processed: "+str(count)

if short_row_list: 
     print "Possible problem (short row) after resolution: "
     for element in short_row_list:
            print element
            
if abbr_months_list: 
     print "\nAbbreviated months currently not recoded into full name:"
     for element in abbr_months_list:
            print element
else:
     print "No problems with abbreviated months"

if wrong_dateformat_list: 
     print "\nDates not following common standard:"
     for element in wrong_dateformat_list:
            print element
else:
     print "No problems with deviating date formats"

if missing_date_list: 
     print "\nResolution ID for entries with missing date:"
     for element in missing_date_list:
            print element
else:
     print "No missing entries for date"
if missing_year_list: 
     print "\nResolution ID for entries with missing year:"
     for element in missing_year_list:
            print element
else:
     print "No missing entries for year"
if missing_res_id_list: 
     print "\nDate for entries with missing resolution ID"
     for element in missing_res_id_list:
            print element
else:
     print "No missing entries for resolution ID"
if missing_consec_number_list: 
     print "\nResolution IDs missing from consecutive series of numbers\n (session number format different for sessions 1-30)"
     print "56/299 is listed because resolution numbers 56/499 - 56/300 are missing"
     for element in missing_consec_number_list:
            print element
else:
     print "No missing consecutive numbers for resolution ID"
if not_assigned_list:
     print "\nResolution ID number has not been assigned: "
     for element in not_assigned_list:
            print element     